Example run ofexisting GSD implementation
GSD - Geneset distances Annekathrin Ludt - Institute of Medical Biostatistics, Epidemiology and Informatics (IMBEI) - anneludt@uni-mainz.de University Medical Center - Mainz (Germany)
library("igraph")
library("utils")
library("DT")
library("Matrix")
library("dplyr")
library("pheatmap")
library("taxize")
library("BiocFileCache")
library("STRINGdb")
In the first step, we read in the data. The data is expected to be in a tab-seperated format, containing two columns: 1. a column with the name of the genesets and 2. a column with the genes contained in the geneset separated by comma.
getGenes <- function(genesets){
if(length(genesets) == 0){
return(list())
}
genesets <- lapply(1:nrow(genesets), function(i) {
toupper(strsplit(genesets[i, 1], " ")[[1]])
})
return(genesets)
}
genesets <- utils::read.delim(
"sample_geneset.txt",
header = TRUE,
as.is = TRUE,
sep = "\t",
quote = "",
row.names = 1,
check.names = FALSE
)
DT::datatable(genesets)
geneset_names <- seq(1, length(rownames(genesets)))
genes <- getGenes(genesets)
After the data has been read in, it can be scored. For this, we currently provide three scores.
getMeetMinMatrix <- function(genesets, geneset_names){
l <- length(genesets)
if(l == 0){
return(-1)
}
m <- Matrix::Matrix(0, l, l)
for(i in 1:(l - 1)){
a <- genesets[i]
for(j in (i+1):l){
b <- genesets[j]
int <- length(intersect(a, b))
m[i, j] <- m[j, i] <- 1 - (int / min(length(a), length(b)))
}
}
rownames(m) <- geneset_names
colnames(m) <- geneset_names
return(m)
}
calculateKappa <- function(a, b, all_genes){
set_int <- length(intersect(a, b))
l <- length(all_genes)
only_a <- sum(all_genes %in% a & !all_genes %in% b)
only_b <- sum(!all_genes %in% a & all_genes %in% b)
background <- l - sum(only_a, only_b, set_int)
O <- (set_int + background) / l
E <- (set_int + only_a) * (set_int + only_b) + (only_b + background) * (only_a + background)
E <- E / l^2
kappa <- ((O-E) / (1-E))
if(is.nan(kappa)){
kappa <- 0
}
return(min(abs(1 - kappa), 1))
}
getKappaMatrix <- function(genesets, geneset_names){
l <- length(genesets)
if(l == 0){
return(-1)
}
k <- Matrix::Matrix(0, l, l)
unique_genes <- unique(unlist(genesets))
for(i in 1:(l - 1)){
a <- unlist(genesets[i])
for(j in (i+1):l){
b <- unlist(genesets[j])
k[i, j] <- k[j, i] <- calculateKappa(a, b, unique_genes)
}
}
rownames(k) <- geneset_names
colnames(k) <- geneset_names
return(k)
}
mmDist <- getMeetMinMatrix(genes, geneset_names)
kappaDist <- getKappaMatrix(genes, geneset_names)
pheatmap::pheatmap(mmDist, show_rownames = F, show_colnames = F)
pheatmap::pheatmap(kappaDist, show_rownames = F, show_colnames = F)
Next, we plot a graph of close genesets.
getAdjacencyMatrix <- function(distanceMatrix, cutOff){
l <- nrow(distanceMatrix)
adjMat <- Matrix::Matrix(0, l, l)
for(i in 1:l){
edge <- which(distanceMatrix[i, ] <= cutOff)
no_edge <- which(distanceMatrix[i, ] > cutOff)
adjMat[i, edge] <- 1
adjMat[i, no_edge] <- 0
}
rownames(adjMat) <- rownames(distanceMatrix)
colnames(adjMat) <- colnames(distanceMatrix)
return(adjMat)
}
buildGraph <- function(adjMatrix){
g <- igraph::graph_from_adjacency_matrix(
adjMatrix,
mode = "undirected",
add.colnames = NULL,
add.rownames = NA
)
return(simplify(g))
}
adj <- getAdjacencyMatrix(mmDist, 0.3)
g <- buildGraph(adj)
plot(g, layout=layout.random, main="random")
adj <- getAdjacencyMatrix(kappaDist, 0.3)
g <- buildGraph(adj)
plot(g,layout=layout.random, main="random")
In the next step, we cluster the data based on the scoring.
checkInclusion <- function(seeds) {
if(length(seeds) <= 1){
return(seeds)
}
remove <- c()
for (i in 1:(length(seeds) - 1)) {
s1 <- seeds[[i]]
l1 <- length(s1)
for (j in (i + 1):length(seeds)) {
s2 <- seeds[[j]]
l2 <- length(s2)
if (l1 < l2) {
if (length(intersect(s1, s2)) == l1) {
remove <- c(remove, i)
break
}
} else{
if (length(intersect(s2, s1)) == l2) {
remove <- c(remove, j)
}
}
}
}
remove <- unique(remove)
if(length(remove) == 0){
return(seeds)
}else{
return(seeds[-remove])
}
}
seedFinding <- function(distances, simThreshold, memThreshold){
# simthreshold: what is considered 'close' relationship
# memthreshold: how many members of a possible seed need a close relationship for the seed to be considered
seeds <- list()
# Build matrix with xij = 1 indicating i and j are close (sim(i, j) <= simThreshold)
reach <- apply(distances, 1, function(x) as.numeric(x <= simThreshold))
for(i in 1:nrow(distances)){
if(sum(reach[i,], na.rm = TRUE) >= 2){
members <- which(reach[i, ] == 1)
includethreshold <- (length(members)^2 - length(members)) * memThreshold
reach_red <- reach[members, members]
in_reach <- sum(reach_red)
if(in_reach >= includethreshold){
members <- c(members, i)
seeds <- c(list(sort(unique(members))), seeds)
}
}
}
seeds <- checkInclusion(unique(seeds))
return(seeds)
}
clustering <- function(seeds, threshold){
if(length(seeds) <= 1){
return(seeds)
}
mergeable <- rep(TRUE, length(seeds))
while(any(mergeable)){
index <- which(mergeable)[1]
s1 <- seeds[[index]]
l <- length(seeds)
for(j in 1:length(seeds)){
s2 <- seeds[[j]]
int <- intersect(s1, s2)
union <- union(s1, s2)
if(length(int) >= (threshold * length(union))){
remove <- list(s1, s2)
seeds <- seeds[!(seeds %in% remove)]
seeds <- c(list(union), seeds)
mergeable <- mergeable[-c(index, j)]
mergeable <- c(TRUE, mergeable)
break
}
}
if(l == length(seeds)){
mergeable[[index]] <- FALSE
}
}
return(seeds)
}
mmSeeds <- seedFinding(mmDist, 0.3, 0.5)
mmCluster <- clustering(mmSeeds, 0.5)
kappaSeeds <- seedFinding(kappaDist, 0.3, 0.5)
kappaCluster <- clustering(kappaSeeds, 0.5)
getAdjacencyMatrixCluster <- function(cluster, l){
adj <- Matrix::Matrix(0, l, l)
for(i in cluster){
nodes <- unlist(i)
li <- length(nodes)
for(j in 1:(li - 1)){
a <- nodes[j]
for(k in (j + 1):li){
b <- nodes[k]
adj[a, b] <- adj[b, a] <- 1
}
}
}
return(adj)
}
mmClusterAdj <- getAdjacencyMatrixCluster(mmCluster, nrow(mmDist))
mmClustGraph <- buildGraph(mmClusterAdj)
plot(mmClustGraph, layout=layout.random, main="random")
kappaClusterAdj <- getAdjacencyMatrixCluster(kappaCluster, nrow(kappaDist))
kappaClustGraph <- buildGraph(kappaClusterAdj)
plot(kappaClustGraph, layout=layout.random, main="random")
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] STRINGdb_2.8.4 BiocFileCache_2.4.0 dbplyr_2.1.1
## [4] taxize_0.9.100 pheatmap_1.0.12 dplyr_1.0.9
## [7] Matrix_1.4-1 DT_0.23 igraph_1.3.1
## [10] knitr_1.39
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-157 bitops_1.0-7 bold_1.2.0
## [4] bit64_4.0.5 filelock_1.0.2 RColorBrewer_1.1-3
## [7] httr_1.4.3 tools_4.2.0 bslib_0.3.1
## [10] utf8_1.2.2 R6_2.5.1 KernSmooth_2.23-20
## [13] DBI_1.1.2 colorspace_2.0-3 tidyselect_1.1.2
## [16] bit_4.0.4 curl_4.3.2 compiler_4.2.0
## [19] chron_2.3-57 cli_3.3.0 xml2_1.3.3
## [22] sass_0.4.1 caTools_1.18.2 scales_1.2.0
## [25] rappdirs_0.3.3 stringr_1.4.0 digest_0.6.29
## [28] rmarkdown_2.14 pkgconfig_2.0.3 htmltools_0.5.2
## [31] plotrix_3.8-2 fastmap_1.1.0 highr_0.9
## [34] htmlwidgets_1.5.4 rlang_1.0.2 rstudioapi_0.13
## [37] httpcode_0.3.0 RSQLite_2.2.14 jquerylib_0.1.4
## [40] generics_0.1.2 zoo_1.8-10 jsonlite_1.8.0
## [43] crosstalk_1.2.0 gtools_3.9.2.1 RCurl_1.98-1.6
## [46] magrittr_2.0.3 Rcpp_1.0.8.3 munsell_0.5.0
## [49] fansi_1.0.3 ape_5.6-2 proto_1.0.0
## [52] lifecycle_1.0.1 sqldf_0.4-11 stringi_1.7.6
## [55] yaml_2.3.5 gplots_3.1.3 plyr_1.8.7
## [58] grid_4.2.0 blob_1.2.3 parallel_4.2.0
## [61] crayon_1.5.1 lattice_0.20-45 conditionz_0.1.0
## [64] hash_2.2.6.2 pillar_1.7.0 uuid_1.1-0
## [67] codetools_0.2-18 crul_1.2.0 glue_1.6.2
## [70] evaluate_0.15 data.table_1.14.2 BiocManager_1.30.18
## [73] vctrs_0.4.1 png_0.1-7 foreach_1.5.2
## [76] gtable_0.3.0 purrr_0.3.4 reshape_0.8.9
## [79] assertthat_0.2.1 gsubfn_0.7 cachem_1.0.6
## [82] xfun_0.31 tibble_3.1.7 iterators_1.0.14
## [85] memoise_2.0.1 ellipsis_0.3.2 BiocStyle_2.24.0